###################################################
### Project: Issue Competition in Parliamentary ###
###          Speeches?                          ###
### Title:   Preprocessing                      ###
### Author:  Christoph Ivanusch                 ###
###################################################

# Preparation

## run this script in a fresh R session (restart R before sourcing it)
## NOTE: the former `rm(list = ls())` call was removed on purpose. It only deletes
## user-created objects (attached packages, options and the working directory stay
## untouched), so it does not provide a clean state, and it silently wipes the
## workspace of anyone who sources this script from an existing session.

## load packages
library(quanteda)
library(readtext)
library(newsmap)
library(stringr)

## construct corpus

### read the serialized data set; the cleaning steps below operate on its `text` column
data <- readRDS(file = "Corp_Nationalrat_20021220_20191022.RDS")

### change date format as there are problems with splitting the texts into sentences otherwise
#### A date such as "12. Jänner " is rewritten to "1 <month number> " so that the
#### punctuation mark after the day can no longer be taken for a sentence boundary.
#### All twelve months share one pattern template and one replacement format; "März"
#### previously used a different pattern ("März*", which also matched "Mär") and a
#### different replacement ("1-3-") than the other eleven months.
month_stems <- c(
  "J(a|ä)n", "Feb", "März", "Apr", "Mai", "Jun",
  "Jul", "Aug", "Sep", "Okt", "Nov", "Dez"
)
for (month_no in seq_along(month_stems)) {
  # day (1-2 digits) + punctuation + optional blank + month stem + rest of the
  # month name + optional "/" + optional blank
  date_pattern <- paste0(
    "[[:digit:]]{1,2}[[:punct:]][[:blank:]]?",
    month_stems[month_no],
    "[[:alpha:]]*/?[[:blank:]]?"
  )
  # the months are processed in calendar order, exactly as before
  data$text <- str_replace_all(data$text, date_pattern, paste0("1 ", month_no, " "))
}

### replace the full stop of titles ("Mag.", "Ing.") and of other abbreviations ("Abs.", "Art.", "Nr.")
### with a blank, as there are problems with splitting the texts into sentences otherwise
#### The abbreviations are matched literally via fixed(). Used as a regular expression,
#### the "." in e.g. "Mag." stands for ANY character, so ordinary words such as "Magen",
#### "Ingenieur", "Absatz" or "Artikel" were cut apart as well ("Art kel").
for (abbreviation in c("Mag", "Ing", "Abs", "Art", "Nr")) {
  data$text <- str_replace_all(
    data$text,
    fixed(paste0(abbreviation, ".")),
    paste0(abbreviation, " ")
  )
}

### change some strings as there are problems with splitting the texts into sentences otherwise
#### An ordinal number (1-3 digits + punctuation mark) directly in front of "Jahr..." or
#### "Geburt..." is replaced by a plain "1", which removes the punctuation mark.
#### The former patterns ended in "Jahr*" / "Geburt*": the "*" only quantified the last
#### letter, so "Jah" / "Gebur" matched too and got a letter inserted. The stem is now
#### required in full; the remainder of the word is left untouched as before.
data$text <- str_replace_all(data$text, "[[:digit:]]{1,3}[[:punct:]][[:blank:]]?Jahr", "1 Jahr") # e.g.: 21. Jahrhundert; in english: 21st century
data$text <- str_replace_all(data$text, "[[:digit:]]{1,3}[[:punct:]][[:blank:]]?Geburt", "1 Geburt") # e.g.: 21. Geburtstag; in english: 21st birthday

### change some strings (words associated with parties) as they might distract classification otherwise
#### Example: sentences containing "Sozialdemokratie" might be classified incorrectly into the
#### issue "Arbeit_und_Soziales" merely because the party or its members are mentioned.
#### The word stem AND its trailing letters are replaced by the party label. The former
#### suffix class [[:digit:]]{0,5} never matched letters, which left fragments such as
#### "SPÖie" (Sozialdemokratie), "FPÖen" (Freiheitlichen) or "GRPn" (Grünen) in the text.
data$text <- str_replace_all(data$text, "Sozialdemokrat[[:alpha:]]*", "SPÖ")
data$text <- str_replace_all(data$text, "Freiheitlich[[:alpha:]]*", "FPÖ")
data$text <- str_replace_all(data$text, "Grüne[[:alpha:]]*", "GRP")

### remove direct addresses ("... Damen und Herren") as they might distract classification otherwise
#### Background I: speakers use direct addresses as a stylistic device at the start of, at the
#### end of, or within longer sentences.
#### Background II: left in place, these phrases produce many false positives for the
#### category "Begrueßung".
#### Every pattern below is replaced by a single blank; the patterns are applied one after
#### another in the listed order, each one working on the result of the previous one.
data$text <- Reduce(
  function(speech_text, address_pattern) str_replace_all(speech_text, address_pattern, " "),
  c(
    # ", meine Damen und Herren" within or at the end of a longer sentence
    ",[[:blank:]]?meine[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren",
    # "Meine Damen und Herren," at the start of a longer sentence
    "Meine[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren,",
    # ", (meine) (sehr) geschätzte/geehrte ... Damen und Herren" within or at the end of a longer sentence
    ",[[:blank:]]?[[:alpha:]]{0,}[[:blank:]]?[[:alpha:]]{0,}[[:blank:]]?ge[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren",
    # "Geschätzte/Geehrte Damen und Herren," at the start of a longer sentence
    "[Gg]e[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren,",
    # ", geschätzte/geehrte Damen und Herren" inside a longer sentence
    ",[[:blank:]]?[Gg]e[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren",
    # "Sehr geschätzte/geehrte Damen und Herren," at the start of a longer sentence
    "[Ss]ehr[[:blank:]]?ge[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren,",
    # ", sehr geschätzte/geehrte Damen und Herren" inside a longer sentence
    ",[[:blank:]]?[Ss]ehr[[:blank:]]?ge[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren",
    # ", (meine) (sehr) verehrte(n) Damen und Herren" inside a longer sentence
    ",[[:blank:]]?[[:alpha:]]{0,}[[:blank:]]?[[:alpha:]]{0,}[[:blank:]]?verehrt[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren",
    # "Verehrte Damen und Herren," at the start of a longer sentence
    "[Vv]erehrt[[:alpha:]]{0,}[[:blank:]]?Damen[[:blank:]]?und[[:blank:]]?Herren,"
  ),
  data$text
)

### turn the cleaned data into a quanteda corpus and summarise its first 5 documents
corp <- corpus(data)
summary(corp, n = 5)

### reshape the corpus so that each document is a single sentence; summarise the first 10
corp_sent <- corpus_reshape(corp, to = "sentences")
summary(corp_sent, n = 10)

# tokenise the sentence-level corpus
## step 1: split into tokens, dropping punctuation and numbers
toks_1 <- tokens(
  corp_sent,
  remove_punct = TRUE,
  remove_numbers = TRUE
)

## step 2: compound the listed multi-word phrases ("Europäische*" carries a wildcard)
toks_comp <- tokens_compound(
  toks_1,
  pattern = phrase(c("Hohes Haus", "Europäische* Union"))
)

## step 3: remove German stopwords
toks <- tokens_remove(toks_comp, pattern = stopwords(language = "de"))

# build the document-feature matrix
## capitalisation is kept (tolower = FALSE); features with a term frequency below 10 are dropped
dfmat <- dfm_trim(dfm(toks, tolower = FALSE), min_termfreq = 10)
print(dfmat)

# store corp, corp_sent, dfmat and toks together in one RDA file
save(list = c("corp", "corp_sent", "dfmat", "toks"), file = "Preprocession_Data.RDA")

